library(tidyverse)
library(DESeq2)
library(factoextra)
library(ggplot2)
library(ggrepel)
library(EnsDb.Hsapiens.v75)
library(clusterProfiler)
library(DOSE)
library(msigdbr)
library(enrichplot)
library(org.Hs.eg.db) 
prep_vulcano_plot_data = function(df, pval=0.05, fc=1, bm=25){
  df$diffexpressed = "NO"
  df$diffexpressed[df$log2FoldChange > 1 & df$padj < pval & df$baseMean > bm] = "UP"
  df$diffexpressed[df$log2FoldChange < -1 & df$padj < pval & df$baseMean > bm] = "DOWN"
  
  return(df)
}

vulcano_plot = function(df, title, log2fc = 1, pval=0.05, label_top_n = 20){
  p = df %>%
    rownames_to_column() %>%
    mutate(SYMBOL = ifelse(padj > (df %>% dplyr::select(padj) %>% top_n(-label_top_n) %>% top_n(1) %>% unique() %>% as.numeric()), NA, SYMBOL)) %>%
    ggplot(aes(x=log2FoldChange, y=-log10(padj), col=diffexpressed, label=SYMBOL)) + 
    geom_point(size=2, alpha=0.7) +
    scale_color_manual(values=c("#067BC2", "black", "#F37748"), 
                       name="Change", 
                       labels=c("Down", "No Change", "Up")) +
    geom_vline(xintercept=c(-log2fc, log2fc), col="red") +
    geom_hline(yintercept=-log10(pval), col="red") +
    geom_text_repel(show.legend = F, box.padding = 0.5, max.overlaps = Inf) +
    theme_bw() +
  theme(text = element_text(size = 25), 
        axis.line = element_line(size = 0.5), 
        panel.border = element_blank(), 
        panel.grid = element_blank(), 
        strip.background = element_blank(), 
        legend.title = element_blank()) +
    ggtitle(paste(title, " (p-val=", pval, ")", sep="")) +
    xlab("log2 Fold Change") +
    ylab("-log10 Adjusted p-Value")

  return(p)
}


run_GSEA = function(gene_list, pval=0.1, cat, subcat=NA, minset=10){

  set.seed(321)

  #TERM2GENE
  term2gene = msigdbr(species = "human", category = cat, subcategory = subcat) %>%          
              dplyr::select(c("gs_name", "human_ensembl_gene")) %>%
              mutate(across('gs_name', str_replace, 'GOBP_', ''))

  #run gsea
  gse = GSEA(gene_list,
             minGSSize = minset,
             maxGSSize = 500,
             pvalueCutoff = pval,
             pAdjustMethod = "BH",
             TERM2GENE = term2gene,
             verbose = TRUE,
             seed = TRUE)

  gse=setReadable(gse,
                  OrgDb = org.Hs.eg.db,
                  keyType = "ENSEMBL")
  return(gse)
}

create_gene_list = function(df, pval=0.05, fc=1, bm=100, direction="BOTH"){

  #create gene list with entrez ID as rownmanes and fold change as values
  tmp_gene_list = df %>%
    filter(!is.na(padj)) %>%
    dplyr::select(log2FoldChange, ID, padj, baseMean) %>%
    filter(padj <= pval & (log2FoldChange >= fc | log2FoldChange <= fc) & baseMean >= bm) %>%
    separate(ID, into=c("ID", "Symbol"), sep=":") %>%
    na.omit() %>%
    arrange(desc(log2FoldChange)) %>%
    dplyr::select(-padj)

  if(direction == "UP"){
    tmp_gene_list = tmp_gene_list %>%
      filter(log2FoldChange > 0)
  }
  else if(direction == "DOWN"){
    tmp_gene_list = tmp_gene_list %>%
      filter(log2FoldChange < 0)
  }

  gene_list = tmp_gene_list$log2FoldChange
  names(gene_list) = tmp_gene_list$ID

  return(gene_list)
}
PATH_DATA = "/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/patient_triceps/data/osteosarcoma_counts_triceps_signature_profyle.tsv"
PATH_META_SAMPLE = "/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/files/sample_meta.tsv"
PATH_META_PATIENT = "/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/files/patient_meta.tsv"
meta_pat = data.table::fread(PATH_META_PATIENT) %>% arrange(Patient_N)
meta_pat
meta_samp = data.table::fread(PATH_META_SAMPLE) 
set.seed(321)
meta = full_join(meta_samp, meta_pat, by = 'Patient_N') %>%
       mutate(Response = tolower(Response), 
              Primary_location = tolower(Primary_location)) %>%
  filter(!is.na(RNA_triceps_signature_profyle)) %>%
  filter(!is.na(Patient_N)) %>%
  group_by(Patient_N) %>%
  sample_n(1)
exp = data.table::fread(PATH_DATA) %>%
      column_to_rownames('ensembl_id') 
#rename duplicated samples 
x = sapply((exp %>% names %>% strsplit(split = '_')),"[[",2)

x[x == "PRO-00316"][1] = "PRO-00316_A"
x[x == "PRO-00316"][1] = "PRO-00316_B"
x[x == "TCMG198"][1] = "TCMG198_A"
x[x == "TCMG198"][1] = "TCMG198_B"
x[x == "TCMG198"][1] = "TCMG198_C"
x[x == "SISJ0252"][1] = "SISJ0252_A"
x[x == "SISJ0252"][1] = "SISJ0252_B"

names(exp) = x
exp = exp %>%
  select(meta$RNA_ID_triceps_signature_profyle)

exp
meta %>%
  select(EZHIP_Status, Patient_N) %>%
  distinct() %>%
  dplyr::count(EZHIP_Status) %>%
  ggblanket::gg_bar(x = EZHIP_Status, y = n, stat = 'identity')

# Normalize gene expression

meta = meta[match(colnames(exp), meta$RNA_ID_triceps_signature_profyle),]
rownames(meta) = meta$RNA_ID_triceps_signature_profyle

rownames(meta) == colnames(exp)
##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE
dds = DESeqDataSetFromMatrix(countData=exp, 
                              colData=meta, 
                              design=~1)
dds = DESeq(dds)
EZHIP = counts(dds, normalized=T) %>% 
  as.data.frame() %>% 
  dplyr::filter(rownames(.) == 'ENSG00000187690') %>% 
  t %>% 
  as.data.frame() %>%
  base::merge(colData(dds) %>% as.data.frame() %>% dplyr::select(EZHIP_Status), by = "row.names") %>%
  mutate(EZHIP = log2(ENSG00000187690+1)) 

EZHIP %>% arrange(desc(EZHIP))
EZHIP %>%
  ggblanket::gg_boxplot(x = EZHIP_Status, y = EZHIP, y_title = 'EZHIP log2 normalized expression', size = 2) +
  geom_point(data = EZHIP, aes(x = EZHIP_Status, y = EZHIP))

EZHIP = EZHIP %>% dplyr::filter(EZHIP_Status == 'positive' | (EZHIP_Status == 'negative' & EZHIP == 0))

meta = meta %>% dplyr::filter(RNA_ID_triceps_signature_profyle %in% EZHIP$Row.names)
exp = exp %>% dplyr::select(meta$RNA_ID_triceps_signature_profyle)
rownames(meta) == colnames(exp)
##  [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
meta %>%
  dplyr::count(EZHIP_Status, Sex)
exp = exp %>%
  rownames_to_column('ID')

geneIDs1 = ensembldb::select(EnsDb.Hsapiens.v75, keys= exp$ID, keytype = "GENEID", columns = c("SYMBOL","GENEID")) %>%
           mutate(ENS_SYM = paste0(GENEID,':', SYMBOL))


exp = exp %>% merge(geneIDs1, by.x = 'ID', by.y = 'GENEID') %>%
  column_to_rownames('ENS_SYM') %>%
  dplyr::select(!c(ID, SYMBOL))
dds = DESeqDataSetFromMatrix(countData=exp, 
                              colData=meta, 
                              design=~EZHIP_Status)
dds = DESeq(dds)
res = results(dds) %>%
  as.data.frame()

res %>%
  rownames_to_column('ID') %>%
  dplyr::filter(grepl('PAGE', ID)) %>%
  arrange(padj)

0.1 PCA

var = counts(dds, normalized=TRUE) %>%
  as.data.frame() %>%
  apply(1,var) %>%
  as.data.frame() %>%
  top_frac(n = 0.5, .)
pca = counts(dds, normalized=TRUE) %>%
  as.data.frame() %>%
  rownames_to_column('gene_id') %>%
  dplyr::filter(gene_id %in% rownames(var)) %>%
  column_to_rownames('gene_id') %>%
  t() %>%
  as.data.frame() %>%
  prcomp()
fviz_eig(pca)

cowplot::plot_grid(fviz_pca_ind(pca,
             axes = c(1, 2),
             habillage = colData(dds)$EZHIP_Status, # color by groups
             palette = c("#067BC2", "#D56062", "#FFD670"),
             addEllipses = TRUE, # Concentration ellipses
             ellipse.type = "confidence",
             legend.title = "Groups",
             repel = TRUE,
             geom='point'),
fviz_pca_ind(pca,
             axes = c(2, 3),
             habillage = colData(dds)$EZHIP_Status, # color by groups
             palette = c("#067BC2", "#D56062", "#FFD670"),
             addEllipses = TRUE, # Concentration ellipses
             ellipse.type = "confidence",
             legend.title = "Groups",
             repel = TRUE,
             geom='point'),
fviz_pca_ind(pca,
             axes = c(3, 4),
             habillage = colData(dds)$EZHIP_Status, # color by groups
             palette = c("#067BC2", "#D56062", "#FFD670"),
             addEllipses = TRUE, # Concentration ellipses
             ellipse.type = "confidence",
             legend.title = "Groups",
             repel = TRUE,
             geom='point'
             ),
fviz_pca_ind(pca,
             axes = c(4, 5),
             habillage = colData(dds)$EZHIP_Status, # color by groups
             palette = c("#067BC2", "#D56062", "#FFD670"),
             addEllipses = TRUE, # Concentration ellipses
             ellipse.type = "confidence",
             legend.title = "Groups",
             repel = TRUE,
             geom='point')
)

1 DEG results

results(dds) %>%
  as.data.frame() %>%
  rownames_to_column('ID') %>%
  separate('ID', into = c('ENS', 'SYMBOL'), sep = ':') %>%
  prep_vulcano_plot_data(., bm = 0) %>%
  filter(!diffexpressed == 'NO')
results(dds) %>%
  as.data.frame() %>%
  rownames_to_column('ID') %>%
  separate('ID', into = c('ENS', 'SYMBOL'), sep = ':') %>%
  prep_vulcano_plot_data(., bm = 25) %>%
  vulcano_plot(., title = 'c', label_top_n = 50) +
  theme(legend.position = 'none', 
        plot.title = element_blank())

# GSEA

gene_list_tmp = res %>%
  filter(!is.na(padj)) %>%
  rownames_to_column('ID') %>%
  separate(ID, into = c('ENS')) %>%
  dplyr::select(ENS, log2FoldChange) %>%
  arrange(desc(log2FoldChange))

gene_list = gene_list_tmp$log2FoldChange
names(gene_list) = gene_list_tmp$ENS
GO = run_GSEA(gene_list,
              cat="C5",
              subcat="BP", 
              pval = 0.05)

GO %>% as.data.frame()
dotplot(GO, showCategory = 30, x = 'NES') +
  scale_colour_gradient2(high = "#E3E9C2", mid = "#E53D00") +
  coord_cartesian(xlim = c(-2, 2))